In [99]:
# Core data handling
import pandas as pd
import polars as pl
import polars.selectors as cs
# Modeling
from catboost import Pool, CatBoostClassifier
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
# File handling for the zipped dataset export
import tempfile
import os
import zipfile
# Model explanation
import shap
# Plotly setup so interactive CatBoost widgets render in VS Code notebooks
import plotly.io as pio
import plotly as plotly
plotly.offline.init_notebook_mode()
pio.renderers.default = "vscode+notebook"
In [100]:
def read_dataset_export(file_name, src_folder=".",
                        tmp_folder=None,
                        lazy=False,
                        verbose=False):
    """Read a dataset export into a Polars frame.

    The export is either a multi-line (NDJSON) ``.json`` file, or a ``.zip``
    archive containing a ``data.json`` NDJSON member that is extracted to a
    temporary folder first.

    Parameters
    ----------
    file_name : str
        Name of the ``.json`` or ``.zip`` export. Tried as given, then
        relative to ``src_folder``.
    src_folder : str
        Fallback folder to resolve ``file_name`` against.
    tmp_folder : str or None
        Where to extract ``data.json`` from a zip; defaults to the system
        temp directory.
    lazy : bool
        If True return a ``pl.LazyFrame`` (``scan_ndjson``), otherwise an
        eager ``pl.DataFrame`` (``read_ndjson``).
    verbose : bool
        Print progress/diagnostic messages.

    Raises
    ------
    FileNotFoundError
        If the export (or the ``data.json`` inside the zip) cannot be found.
    """
    json_file = None
    error_reason = ""
    tmp_folder = tmp_folder if tmp_folder else tempfile.gettempdir()
    # Removing an extracted temp file is only safe for eager reads: a
    # LazyFrame from scan_ndjson still needs the file at collect() time.
    remove_after_read = False
    if file_name.endswith(".json"):
        error_reason = "Error reading JSON file"
        if os.path.exists(file_name):
            json_file = file_name
        elif os.path.exists(os.path.join(src_folder, file_name)):
            json_file = os.path.join(src_folder, file_name)
        if json_file and verbose:
            print("Reading JSON file", json_file)
    else:
        # Assume a zip export (possibly named without the .zip suffix).
        zip_file = file_name
        if file_name.endswith(".zip"):
            error_reason = "Error reading ZIP file"
            if os.path.exists(file_name):
                zip_file = file_name
            elif os.path.exists(os.path.join(src_folder, file_name)):
                zip_file = os.path.join(src_folder, file_name)
            if verbose:
                print("Reading ZIP file", zip_file)
        if os.path.exists(zip_file):
            error_reason = "Error extracting data.json"
            if verbose:
                print("Extracting data.json from", zip_file)
            json_file = os.path.join(tmp_folder, "data.json")
            if os.path.exists(json_file):
                os.remove(json_file)  # drop a stale extract from a previous run
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                all_zip_entries = zip_ref.namelist()
                json_file_in_zip = [s for s in all_zip_entries if "data.json" in s]
                if verbose:
                    print("data.json in zip file:", json_file_in_zip, zip_file)
                for file in json_file_in_zip:
                    zip_ref.extract(file, tmp_folder)
                    json_file = os.path.join(tmp_folder, file)
            if not os.path.exists(json_file):
                raise FileNotFoundError(
                    f"Dataset zipfile {zip_file} does not have \"data.json\"")
            remove_after_read = not lazy
    if json_file is None:
        raise FileNotFoundError(f"Dataset export not found {error_reason}")
    # Generous schema-inference window: these exports are wide and sparsely
    # populated, so a short window can mis-type columns.
    if lazy:
        multi_line_json = pl.scan_ndjson(json_file, infer_schema_length=100000)
    else:
        multi_line_json = pl.read_ndjson(json_file, infer_schema_length=100000)
    if remove_after_read:
        os.remove(json_file)
    return multi_line_json
Read and Pre-process data¶
In [101]:
# Load the click-through dataset export lazily (LazyFrame) so the
# pre-processing below can be optimized before collection.
df = read_dataset_export( "Web_ClickThrough.zip", lazy=True, verbose=True)
df.describe()
Error reading ZIP file Web_ClickThrough.zip Error extracting data.json Web_ClickThrough.zip data.json in zip file: ['data.json'] Web_ClickThrough.zip
Out[101]:
shape: (9, 81)
| statistic | pyModelEvidence | Decision_InteractionID | Customer_IsInArrears | Customer_IsProspect | IH_Retail_Inbound_Impression_pxLastGroupID | Customer_IsActiveMilitaryService | Customer_IsInCollections | Customer_IsStudent | IH_Web_Inbound_Impression_pyHistoricalOutcomeCount | Customer_IsBankruptcy | pyModelPerformance | Decision_DecisionTime | Customer_NumCreditCardAccount | Customer_DebtToIncomeRatio | Customer_IsInDisasterArea | Decision_Rank | Customer_IsCreditScoreStale | pyPropensity | Customer_NumDepositAccount | Customer_PrimaryState | Customer_IsInActiveComplaint | negativeSampling | Customer_ResidentialStatus | Customer_PrimaryMobilePhone | Customer_IsFinanciallyVulnerable | Context_Group | IH_Web_Inbound_Clicked_pxLastGroupID | Decision_OutcomeWeight | Param_Journey | Customer_IsB2C | id | Context_Issue | Customer_MKTCLVValue | Customer_NetWealth | Customer_TotalLiabilities | Customer_OrganizationName | … | Context_Name | Param_JourneyStage | Param_LastJourneyStage | Customer_IsInPrecollections | Customer_HasCriticalIllness | dataCenter | positiveSampling | IH_Web_Inbound_Impression_pxLastGroupID | IH_Web_Inbound_Clicked_pyHistoricalOutcomeCount | IH_Retail_Inbound_Impression_pyHistoricalOutcomeCount | Customer_OwnershipStatus | Decision_Outcome | Customer_AnnualIncome | Param_PriorStageInJourney | Context_Treatment | Customer_LinkedIn | Customer_Prefix | IH_Web_Inbound_Impression_pxLastOutcomeTime_DaysSince | IH_Retail_Inbound_Impression_pxLastOutcomeTime_DaysSince | Decision_SubjectID | Customer_PrimaryCountryCode | Customer_NumInvestmentAccount | Customer_PrimaryPostalCode | rulesetName | rulesetVersion | Customer_IsCustomerActive | Customer_IsIncarcerated | Customer_OrganizationID | pyModelPositives | Customer_NumLoanAccount | Customer_BirthDate | Customer_PrimaryCity | Param_DaysinCurrentStage | IH_Web_Inbound_Clicked_pxLastOutcomeTime_DaysSince | Context_Direction | Context_Channel | Customer_OwnedAccountTypes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | f64 | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | … | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str |
| "count" | "32091" | "32091" | "32091" | "32091" | "162" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | 32091.0 | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | … | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "162" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "162" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "32091" | "9513" |
| "null_count" | "0" | "0" | "0" | "0" | "31929" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | 0.0 | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | … | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "31929" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "31929" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "0" | "22578" |
| "mean" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 0.599699 | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | … | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| "std" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 0.271621 | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | … | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| "min" | "0.0" | "-3042656458600205216" | "" | "" | "Creditcards" | "" | "" | "" | "10.0" | "" | "0.5" | "20250504T082914.693 GMT" | "" | "" | "" | "1.0" | "" | 0.127551 | "" | "" | "" | "100.0" | "" | "" | "" | "Creditcards" | "Creditcards" | "1.0" | "" | "" | "00005377-e1cc-560d-9d32-f04544… | "Grow" | "" | "" | "" | "" | … | "VisaClassic" | "" | "" | "" | "" | "datacenter1" | "100.0" | "Creditcards" | "10.0" | "1.0" | "" | "Clicked" | "" | "" | "Hero Web" | "" | "" | "0.000006041666666666667" | "22.910304131944443" | "C-000" | "" | "" | "" | "NBA-Artifacts" | "01-01-01" | "" | "" | "" | "0.0" | "" | "" | "" | "0.0" | "0.000005520833333333333" | "Inbound" | "Web" | "" |
| "25%" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 0.418575 | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | … | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| "50%" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 0.570433 | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | … | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| "75%" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | 0.863868 | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | … | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
| "max" | "9996.0" | "-3042656458600399942" | "" | "" | "Creditcards" | "" | "false" | "true" | "9.0" | "" | "0.8475515053314687" | "20250505T084203.389 GMT" | "5.0" | "9.0" | "" | "1.0" | "" | 0.997062 | "2.0" | "" | "" | "100.0" | "" | "" | "" | "Creditcards" | "Creditcards" | "1.0" | "" | "" | "fffe469a-e880-5b2b-bc19-b281ee… | "Grow" | "" | "990953.0" | "987467.0" | "" | … | "VisaClassic" | "" | "" | "" | "" | "datacenter1" | "100.0" | "Creditcards" | "9.0" | "9.0" | "" | "NoResponse" | "99769.0" | "" | "Hero Web" | "" | "" | "1.8764207060185185" | "23.951360659722223" | "C-999" | "" | "1.0" | "" | "NBA-Artifacts" | "01-01-01" | "" | "" | "" | "9998.0" | "2.0" | "0.36253927083453164" | "" | "0.0" | "1.853330787037037" | "Inbound" | "Web" | "Loan, Loan, Loan, Loan" |
In [102]:
# All exported column names, alphabetically sorted for a quick overview.
columns = sorted(df.collect_schema().names())
columns
Out[102]:
['Context_Channel', 'Context_Direction', 'Context_Group', 'Context_Issue', 'Context_Name', 'Context_Treatment', 'Customer_AnnualIncome', 'Customer_BirthDate', 'Customer_CLV', 'Customer_CreditScore', 'Customer_DebtToIncomeRatio', 'Customer_HasBrokenPromise', 'Customer_HasCriticalIllness', 'Customer_IsActiveMilitaryService', 'Customer_IsB2C', 'Customer_IsBankruptcy', 'Customer_IsCreditScoreStale', 'Customer_IsCustomerActive', 'Customer_IsFinanciallyVulnerable', 'Customer_IsInActiveComplaint', 'Customer_IsInArrears', 'Customer_IsInCollections', 'Customer_IsInDisasterArea', 'Customer_IsInPrecollections', 'Customer_IsIncarcerated', 'Customer_IsProspect', 'Customer_IsStudent', 'Customer_LinkedIn', 'Customer_MKTCLVValue', 'Customer_NetWealth', 'Customer_NumCreditCardAccount', 'Customer_NumDepositAccount', 'Customer_NumInvestmentAccount', 'Customer_NumLoanAccount', 'Customer_OrganizationID', 'Customer_OrganizationName', 'Customer_OwnedAccountTypes', 'Customer_OwnershipStatus', 'Customer_Prefix', 'Customer_PrimaryCity', 'Customer_PrimaryCountry', 'Customer_PrimaryCountryCode', 'Customer_PrimaryMobilePhone', 'Customer_PrimaryPostalCode', 'Customer_PrimaryState', 'Customer_RelationshipLengthDays', 'Customer_ResidentialStatus', 'Customer_TotalAssets', 'Customer_TotalLiabilities', 'Decision_DecisionTime', 'Decision_InteractionID', 'Decision_Outcome', 'Decision_OutcomeTime', 'Decision_OutcomeWeight', 'Decision_Rank', 'Decision_SubjectID', 'IH_Retail_Inbound_Impression_pxLastGroupID', 'IH_Retail_Inbound_Impression_pxLastOutcomeTime_DaysSince', 'IH_Retail_Inbound_Impression_pyHistoricalOutcomeCount', 'IH_Web_Inbound_Clicked_pxLastGroupID', 'IH_Web_Inbound_Clicked_pxLastOutcomeTime_DaysSince', 'IH_Web_Inbound_Clicked_pyHistoricalOutcomeCount', 'IH_Web_Inbound_Impression_pxLastGroupID', 'IH_Web_Inbound_Impression_pxLastOutcomeTime_DaysSince', 'IH_Web_Inbound_Impression_pyHistoricalOutcomeCount', 'Param_DaysinCurrentStage', 'Param_Journey', 'Param_JourneyStage', 
'Param_LastJourneyStage', 'Param_PriorStageInJourney', 'dataCenter', 'id', 'negativeSampling', 'positiveSampling', 'pyModelEvidence', 'pyModelPerformance', 'pyModelPositives', 'pyPropensity', 'rulesetName', 'rulesetVersion']
In [103]:
df = df.unique(subset=['Decision_InteractionID', 'Context_Treatment'], keep='last')
In [104]:
# Clean up the raw export in one lazy pipeline:
df = df.with_columns(
# Empty strings in the export mean "missing": turn them into real nulls.
pl.when(pl.col(pl.String).str.len_chars() == 0)
.then(None)
.otherwise(pl.col(pl.String))
.name.keep()
).with_columns(
# Interaction-history recency/count fields are numeric; fill missing with 0
# (no prior interaction recorded).
cs.ends_with("_DaysSince",
"_pyHistoricalOutcomeCount",
"DaysinCurrentStage")
.cast(pl.Float64).fill_null(0),
# Numeric customer attributes arrive as strings; cast and default to 0.
# NOTE(review): Customer_BirthDate cast to Float64 looks odd — presumably
# the export encodes it numerically; confirm against the export schema.
pl.col(
[
"Customer_AnnualIncome",
"Customer_CreditScore",
"Customer_DebtToIncomeRatio",
"Customer_NetWealth",
"Customer_RelationshipLengthDays",
"Customer_TotalAssets",
"Customer_TotalLiabilities",
"Customer_BirthDate"
]
)
.cast(pl.Float64)
.fill_null(0),
cs.starts_with("Customer_Num").cast(pl.Float64).fill_null(0),
cs.starts_with("Context_").cast(pl.String),
# Boolean-ish string flags: map the textual true/false variants to booleans.
cs.starts_with("Customer_Is").replace_strict({"false":False, "true":True, "null":False, "False":False, "True":True}),
cs.starts_with("Customer_Has").replace_strict({"false":False, "true":True, "null":False, "False":False, "True":True})
).with_columns(
# Remaining nulls in the flag columns default to False.
cs.starts_with("Customer_Is").fill_null(False).cast(pl.Boolean),
cs.starts_with("Customer_Has").fill_null(False).cast(pl.Boolean)
).with_columns(
# NOTE(review): this pass re-casts/fills columns already handled above
# (minus Customer_BirthDate) — appears redundant; confirm and remove.
pl.col(
[
"Customer_AnnualIncome",
"Customer_CreditScore",
"Customer_DebtToIncomeRatio",
"Customer_NetWealth",
"Customer_RelationshipLengthDays",
"Customer_TotalAssets",
"Customer_TotalLiabilities"
]
).cast(pl.Float64).fill_null(0),
)
In [105]:
# Drop identifiers, sampling/bookkeeping fields, model-internal scores and
# decision metadata that should not be used as predictors.
# Fix: "rulesetVersion" was listed twice in the original drop list.
df = df.drop(["rulesetVersion", "id", "dataCenter", "negativeSampling", "positiveSampling", "rulesetName",
              "Decision_SubjectID", "Decision_OutcomeTime", "Decision_Rank", "Decision_InteractionID",
              "Decision_DecisionTime", "Decision_OutcomeWeight", "pyModelEvidence", "pyModelPerformance",
              "pyModelPositives", "pyPropensity"])
In [106]:
# Collect categorical (string) feature names — excluding Decision_* columns —
# and make their missing values an explicit 'N/A' category for CatBoost.
cat_features = []
schema = df.collect_schema()
for cname in schema.names():
    if cname.startswith("Decision_"):
        continue
    if pl.String.is_(schema[cname]):
        df = df.with_columns(pl.col(cname).fill_null('N/A'))
        cat_features.append(cname)
print(cat_features)
['IH_Retail_Inbound_Impression_pxLastGroupID', 'Customer_PrimaryState', 'Customer_ResidentialStatus', 'Customer_PrimaryMobilePhone', 'Context_Group', 'IH_Web_Inbound_Clicked_pxLastGroupID', 'Param_Journey', 'Context_Issue', 'Customer_MKTCLVValue', 'Customer_OrganizationName', 'Customer_CLV', 'Customer_PrimaryCountry', 'Context_Name', 'Param_JourneyStage', 'Param_LastJourneyStage', 'IH_Web_Inbound_Impression_pxLastGroupID', 'Customer_OwnershipStatus', 'Param_PriorStageInJourney', 'Context_Treatment', 'Customer_LinkedIn', 'Customer_Prefix', 'Customer_PrimaryCountryCode', 'Customer_PrimaryPostalCode', 'Customer_OrganizationID', 'Customer_PrimaryCity', 'Context_Direction', 'Context_Channel', 'Customer_OwnedAccountTypes']
In [107]:
# CatBoost text-processing config for CSV-list columns: split on commas,
# lowercase tokens, build a unigram dictionary and Bag-of-Words features.
text_processing_options = {
"tokenizers": [{
"tokenizer_id": "comma",
"delimiter": ",",
"lowercasing": "true"
}],
"dictionaries": [{
"dictionary_id": "Word",
"gram_order": "1"  # unigrams
}],
"feature_processing": {
"default": [{
"dictionaries_names": ["Word"],
"feature_calcers": ["BoW"],
"tokenizers_names": ["comma"]
}]
}
}
# Columns treated as free text (comma-separated lists) rather than categories.
text_features = ['Customer_OwnedAccountTypes']
In [108]:
cat_features = list(set(cat_features) - set(text_features))
In [109]:
# Materialize the lazy pipeline; everything below works on an eager frame.
df = df.collect()
df.head()
Out[109]:
shape: (5, 64)
| Customer_IsInArrears | Customer_IsProspect | IH_Retail_Inbound_Impression_pxLastGroupID | Customer_IsActiveMilitaryService | Customer_IsInCollections | Customer_IsStudent | IH_Web_Inbound_Impression_pyHistoricalOutcomeCount | Customer_IsBankruptcy | Customer_NumCreditCardAccount | Customer_DebtToIncomeRatio | Customer_IsInDisasterArea | Customer_IsCreditScoreStale | Customer_NumDepositAccount | Customer_PrimaryState | Customer_IsInActiveComplaint | Customer_ResidentialStatus | Customer_PrimaryMobilePhone | Customer_IsFinanciallyVulnerable | Context_Group | IH_Web_Inbound_Clicked_pxLastGroupID | Param_Journey | Customer_IsB2C | Context_Issue | Customer_MKTCLVValue | Customer_NetWealth | Customer_TotalLiabilities | Customer_OrganizationName | Customer_HasBrokenPromise | Customer_CLV | Customer_RelationshipLengthDays | Customer_PrimaryCountry | Customer_TotalAssets | Customer_CreditScore | Context_Name | Param_JourneyStage | Param_LastJourneyStage | Customer_IsInPrecollections | Customer_HasCriticalIllness | IH_Web_Inbound_Impression_pxLastGroupID | IH_Web_Inbound_Clicked_pyHistoricalOutcomeCount | IH_Retail_Inbound_Impression_pyHistoricalOutcomeCount | Customer_OwnershipStatus | Decision_Outcome | Customer_AnnualIncome | Param_PriorStageInJourney | Context_Treatment | Customer_LinkedIn | Customer_Prefix | IH_Web_Inbound_Impression_pxLastOutcomeTime_DaysSince | IH_Retail_Inbound_Impression_pxLastOutcomeTime_DaysSince | Customer_PrimaryCountryCode | Customer_NumInvestmentAccount | Customer_PrimaryPostalCode | Customer_IsCustomerActive | Customer_IsIncarcerated | Customer_OrganizationID | Customer_NumLoanAccount | Customer_BirthDate | Customer_PrimaryCity | Param_DaysinCurrentStage | IH_Web_Inbound_Clicked_pxLastOutcomeTime_DaysSince | Context_Direction | Context_Channel | Customer_OwnedAccountTypes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| bool | bool | str | bool | bool | bool | f64 | bool | f64 | f64 | bool | bool | f64 | str | bool | str | str | bool | str | str | str | bool | str | str | f64 | f64 | str | bool | str | f64 | str | f64 | f64 | str | str | str | bool | bool | str | f64 | f64 | str | str | f64 | str | str | str | str | f64 | f64 | str | f64 | str | bool | bool | str | f64 | f64 | str | f64 | f64 | str | str | str |
| false | false | "N/A" | false | false | false | 13.0 | false | 2.0 | 32.0 | false | false | 2.0 | "N/A" | false | "N/A" | "N/A" | false | "Creditcards" | "Creditcards" | "N/A" | false | "Grow" | "N/A" | 0.0 | 0.0 | "N/A" | false | "High" | 2468.0 | "N/A" | 0.0 | 9.0 | "VisaClassic" | "N/A" | "N/A" | false | false | "Creditcards" | 17.0 | 0.0 | "N/A" | "Clicked" | 0.0 | "N/A" | "Hero Web" | "N/A" | "N/A" | 0.040267 | 0.0 | "N/A" | 1.0 | "N/A" | false | false | "N/A" | 1.0 | 0.0 | "N/A" | 0.0 | 0.001257 | "Inbound" | "Web" | "N/A" |
| false | false | "N/A" | false | false | false | 25.0 | false | 4.0 | 36.0 | false | false | 2.0 | "N/A" | false | "N/A" | "N/A" | false | "Creditcards" | "Creditcards" | "N/A" | false | "Grow" | "N/A" | 4.05488e6 | 447574.0 | "N/A" | false | "Loyal" | 3800.0 | "N/A" | 1.990639e6 | 4.0 | "VisaClassic" | "N/A" | "N/A" | false | false | "Creditcards" | 32.0 | 0.0 | "N/A" | "Clicked" | 496508.0 | "N/A" | "Hero Web" | "N/A" | "N/A" | 0.005005 | 0.0 | "N/A" | 1.0 | "N/A" | false | false | "N/A" | 0.0 | 0.33066 | "N/A" | 0.0 | 0.005339 | "Inbound" | "Web" | "N/A" |
| false | false | "N/A" | false | false | false | 19.0 | false | 0.0 | 2.0 | false | false | 0.0 | "N/A" | false | "N/A" | "N/A" | false | "Creditcards" | "Creditcards" | "N/A" | false | "Grow" | "N/A" | 0.0 | 0.0 | "N/A" | false | "N/A" | 303.0 | "N/A" | 0.0 | 0.0 | "VisaClassic" | "N/A" | "N/A" | false | false | "Creditcards" | 14.0 | 0.0 | "N/A" | "Clicked" | 0.0 | "N/A" | "Hero Web" | "N/A" | "N/A" | 0.002063 | 0.0 | "N/A" | 0.0 | "N/A" | false | false | "N/A" | 0.0 | 0.0 | "N/A" | 0.0 | 0.007697 | "Inbound" | "Web" | "N/A" |
| false | false | "N/A" | false | false | true | 19.0 | false | 2.0 | 25.0 | false | false | 0.0 | "N/A" | false | "N/A" | "N/A" | false | "Creditcards" | "Creditcards" | "N/A" | false | "Grow" | "N/A" | 0.0 | 0.0 | "N/A" | false | "High" | 2053.0 | "N/A" | 0.0 | 8.0 | "VisaClassic" | "N/A" | "N/A" | false | false | "Creditcards" | 29.0 | 0.0 | "N/A" | "NoResponse" | 0.0 | "N/A" | "Hero Web" | "N/A" | "N/A" | 0.209814 | 0.0 | "N/A" | 0.0 | "N/A" | false | false | "N/A" | 2.0 | 0.0 | "N/A" | 0.0 | 0.008526 | "Inbound" | "Web" | "N/A" |
| false | false | "N/A" | false | false | false | 29.0 | false | 5.0 | 10.0 | false | false | 0.0 | "N/A" | false | "N/A" | "N/A" | false | "Creditcards" | "Creditcards" | "N/A" | false | "Grow" | "N/A" | 0.0 | 0.0 | "N/A" | false | "High" | 1563.0 | "N/A" | 0.0 | 2.0 | "VisaClassic" | "N/A" | "N/A" | false | false | "Creditcards" | 13.0 | 0.0 | "N/A" | "Clicked" | 0.0 | "N/A" | "Hero Web" | "N/A" | "N/A" | 0.000302 | 0.0 | "N/A" | 0.0 | "N/A" | false | false | "N/A" | 1.0 | 0.0 | "N/A" | 0.0 | 0.003019 | "Inbound" | "Web" | "N/A" |
Train Model¶
- Automatic handling of categorical features without manual encoding
- Built‑in text processing (those properties cannot be used by ADM currently)
A CatBoost model is trained to understand which features perform well and which do not, when compared to the out-of-the-box NB ADM model. Text processing can be used to enhance model performance (in this example one property contains CSV lists, which cannot be processed by ADM, but, if this analysis reveals them as powerful predictors, can be further encoded and added to ADM models as predictors).
In [110]:
# Convert to pandas and separate the target from the features.
dset = df.to_pandas()
y = dset['Decision_Outcome']
X = dset.drop(['Decision_Outcome'], axis=1)
seed = 127
holdout_size = 0.1  # fraction held out as the final test set (was a magic 0.1)
test_size = 0.2     # fraction of the remainder used for validation
# First carve out the untouched test set, then split train/validation.
# NOTE(review): splits are not stratified on the outcome — confirm the class
# balance per split is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=holdout_size, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=test_size, random_state=seed)
In [111]:
# CatBoost training configuration (with categorical + text feature handling).
params = {'loss_function': 'Logloss', # objective function
'eval_metric': 'AUC', # metric
'verbose': 50, # output to stdout info about training process every 50 iterations
'random_seed': seed,
'cat_features': cat_features,
'text_features': text_features,
'text_processing': text_processing_options,
'one_hot_max_size': 1023, # one-hot encode categoricals up to this cardinality
'class_names': ['NoResponse', 'Clicked'],
'iterations': 100,
'learning_rate': 0.5,
'depth': 8
}
In [112]:
%%time
# Train the text-feature-enabled CatBoost model, keeping the iteration with
# the best validation AUC.
cbc_1 = CatBoostClassifier(**params)
cbc_1.fit(X=X_train, y=y_train, # data to train on (required parameters, unless we provide X as a pool object, will be shown below)
eval_set=(X_val, y_val), # data to validate on
# True if we don't want to save trees created after iteration with the best validation score
use_best_model=True,
# True for visualization of the training process (it is not shown in a published kernel - try executing this code)
plot=True
)
MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
0: test: 0.8568634 best: 0.8568634 (0) total: 8.22ms remaining: 814ms 50: test: 0.8808364 best: 0.8851585 (8) total: 330ms remaining: 317ms 99: test: 0.8747479 best: 0.8851585 (8) total: 619ms remaining: 0us bestTest = 0.8851585395 bestIteration = 8 Shrink model to first 9 iterations. CPU times: user 2.78 s, sys: 697 ms, total: 3.48 s Wall time: 713 ms
Out[112]:
<catboost.core.CatBoostClassifier at 0x16e4c1350>
Review Model Parameters¶
In [113]:
# Evaluation pool over the held-out test set, mirroring the training setup
# (categorical + text features declared so CatBoost encodes them identically).
pool = Pool(X_test, y_test, cat_features=cat_features, text_features=text_features)
In [114]:
cbc_1.get_all_params()
Out[114]:
{'nan_mode': 'Min',
'eval_metric': 'AUC',
'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
'iterations': 100,
'sampling_frequency': 'PerTree',
'fold_permutation_block': 0,
'leaf_estimation_method': 'Newton',
'random_score_type': 'NormalWithModelSizeDecrease',
'counter_calc_method': 'SkipTest',
'grow_policy': 'SymmetricTree',
'penalties_coefficient': 1,
'boosting_type': 'Plain',
'model_shrink_mode': 'Constant',
'feature_border_type': 'GreedyLogSum',
'ctr_leaf_count_limit': 18446744073709551615,
'bayesian_matrix_reg': 0.10000000149011612,
'one_hot_max_size': 1023,
'eval_fraction': 0,
'force_unit_auto_pair_weights': False,
'l2_leaf_reg': 3,
'random_strength': 1,
'rsm': 1,
'boost_from_average': False,
'max_ctr_complexity': 1,
'model_size_reg': 0.5,
'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
'pool_metainfo_options': {'tags': {}},
'subsample': 0.800000011920929,
'use_best_model': True,
'class_names': ['NoResponse', 'Clicked'],
'random_seed': 127,
'depth': 8,
'ctr_target_border_count': 1,
'posterior_sampling': False,
'has_time': False,
'store_all_simple_ctr': False,
'border_count': 254,
'classes_count': 0,
'auto_class_weights': 'None',
'sparse_features_conflict_fraction': 0,
'leaf_estimation_backtracking': 'AnyImprovement',
'best_model_min_trees': 1,
'model_shrink_rate': 0,
'min_data_in_leaf': 1,
'text_processing': {'dictionaries': [{'start_token_id': '0',
'occurrence_lower_bound': '5',
'skip_step': '0',
'end_of_word_token_policy': 'Insert',
'token_level_type': 'Word',
'end_of_sentence_token_policy': 'Skip',
'gram_order': '1',
'max_dictionary_size': '50000',
'dictionary_id': 'Word'}],
'feature_processing': {'default': [{'dictionaries_names': ['Word'],
'feature_calcers': ['BoW:calcer_type="BoW"'],
'tokenizers_names': ['comma']}]},
'tokenizers': [{'number_token': '🔢',
'skip_empty': '1',
'number_process_policy': 'LeaveAsIs',
'tokenizer_id': 'comma',
'token_types': ['Number', 'Unknown', 'Word'],
'delimiter': ',',
'languages': [],
'lemmatizing': '0',
'split_by_set': '0',
'lowercasing': '1',
'subtokens_policy': 'SingleToken',
'separator_type': 'ByDelimiter'}]},
'loss_function': 'Logloss',
'learning_rate': 0.5,
'score_function': 'Cosine',
'task_type': 'CPU',
'leaf_estimation_iterations': 10,
'bootstrap_type': 'MVS',
'max_leaves': 256,
'permutation_count': 4}
In [115]:
# Visualize one early tree of the ensemble on the evaluation pool.
cbc_1.plot_tree(
tree_idx=1,
pool=pool
)
Out[115]:
In [116]:
# Importance by PredictionValuesChange: average change of the prediction when
# the feature value changes.
feature_importance = cbc_1.get_feature_importance(data=pool,
prettified=True,
verbose=True, type="PredictionValuesChange")
feature_importance
Used dataset leave statistics for fstr calculation
Out[116]:
| Feature Id | Importances | |
|---|---|---|
| 0 | Customer_CLV | 36.866382 |
| 1 | Customer_CreditScore | 19.152898 |
| 2 | Customer_DebtToIncomeRatio | 17.961440 |
| 3 | Customer_RelationshipLengthDays | 9.572969 |
| 4 | IH_Web_Inbound_Impression_pxLastOutcomeTime_Da... | 3.770789 |
| ... | ... | ... |
| 58 | Customer_OrganizationID | 0.000000 |
| 59 | Customer_PrimaryCity | 0.000000 |
| 60 | Param_DaysinCurrentStage | 0.000000 |
| 61 | Context_Direction | 0.000000 |
| 62 | Context_Channel | 0.000000 |
63 rows × 2 columns
In [117]:
# Importance by LossFunctionChange: change in test loss when a feature is
# excluded — computed on the held-out pool, so more faithful than
# PredictionValuesChange for ranking predictors.
feature_importance = cbc_1.get_feature_importance(data=pool,
prettified=True,
verbose=True, type="LossFunctionChange")
feature_importance
Used Logloss metric for fstr calculation Selected 3210 documents from 3210 for LossFunctionChange calculation. Used Logloss metric for fstr calculation Started LossFunctionChange calculation 3210/3210 Process documents passed time: 6.72ms remaining time: 0us
Out[117]:
| Feature Id | Importances | |
|---|---|---|
| 0 | Customer_OwnedAccountTypes | 0.147050 |
| 1 | Customer_CreditScore | 0.060189 |
| 2 | Customer_DebtToIncomeRatio | 0.054758 |
| 3 | Customer_RelationshipLengthDays | 0.030252 |
| 4 | IH_Web_Inbound_Clicked_pxLastOutcomeTime_DaysS... | 0.005299 |
| ... | ... | ... |
| 58 | IH_Web_Inbound_Impression_pyHistoricalOutcomeC... | -0.000293 |
| 59 | Customer_NumCreditCardAccount | -0.000321 |
| 60 | Customer_TotalAssets | -0.000521 |
| 61 | Customer_NumLoanAccount | -0.000533 |
| 62 | Customer_BirthDate | -0.000736 |
63 rows × 2 columns
In [118]:
# Predict labels, class probabilities, and raw scores on the test pool.
preds = cbc_1.predict(pool)
preds_proba = cbc_1.predict_proba(pool)
print(preds_proba[:5])
print(cbc_1.predict(pool, 'RawFormulaVal')[:5])
[[0.99580922 0.00419078] [0.96839389 0.03160611] [0.11126779 0.88873221] [0.61395444 0.38604556] [0.99759843 0.00240157]] [-5.47066774 -3.42228835 2.07785618 -0.46396535 -6.02922831]
In [119]:
from sklearn import metrics
# Confusion matrix and per-class precision/recall on the held-out test set.
print(metrics.confusion_matrix(y_test, preds, labels=params.get('class_names')))
print(metrics.classification_report(
y_test, preds, labels=params.get('class_names')))
[[ 945 292]
[ 377 1596]]
precision recall f1-score support
NoResponse 0.71 0.76 0.74 1237
Clicked 0.85 0.81 0.83 1973
accuracy 0.79 3210
macro avg 0.78 0.79 0.78 3210
weighted avg 0.80 0.79 0.79 3210
In [120]:
# ROC curve on the evaluation pool; AUC computed from CatBoost's curve points.
from catboost.utils import get_roc_curve
from sklearn.metrics import auc

curve = get_roc_curve(cbc_1, pool)
(fpr, tpr, thresholds) = curve
roc_auc = auc(fpr, tpr)
# Fix: removed the redundant re-import of matplotlib.pyplot — it is already
# imported as plt in the notebook's top import cell.
plt.figure(figsize=(16, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc, alpha=0.5)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', alpha=0.5)  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver operating characteristic', fontsize=20)
plt.legend(loc="lower right", fontsize=16)
plt.show()
In [121]:
print('error:', 1-np.mean(preds == np.ravel(y_test)))
error: 0.20841121495327097
In [122]:
# Plot learning curves from CatBoost's training logs in catboost_info/.
# Fix: the logged metric is Logloss, not RMSE, so the variables are named
# accordingly; also added plt.legend() — the labels were set but never shown.
logloss_learn = pd.read_csv(
    'catboost_info/learn_error.tsv', header=0, delimiter='\t')
logloss_test = pd.read_csv('catboost_info/test_error.tsv',
                           header=0, delimiter='\t')
plt.plot(logloss_learn['Logloss'], label="Learn Error")
plt.plot(logloss_test['Logloss'], label="Test Error")
plt.legend()
Out[122]:
[<matplotlib.lines.Line2D at 0x310ee2e90>]
Model Analysis¶
In [123]:
shap.initjs()
In [124]:
shap_values = cbc_1.get_feature_importance(pool, type="ShapValues")
In [125]:
# Split off the base value (last column) from the per-feature contributions.
expected_value = shap_values[0, -1]
shap_values = shap_values[:, :-1]
In [126]:
shap.summary_plot(shap_values, X_test, max_display=20, plot_size=[14,10])
In [127]:
shap.summary_plot(shap_values, X_test, plot_type="bar", plot_size=[14,10])
Prediction Explanations¶
In [128]:
shap.plots.force(expected_value, shap_values[50], feature_names=X_test.columns)
Out[128]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Individual Feature Analysis¶
In [129]:
# Binned per-feature statistics (target vs prediction) for the strongest predictors.
feature = ['Customer_DebtToIncomeRatio', 'Customer_CreditScore', 'Customer_CLV', 'Customer_RelationshipLengthDays']
res = cbc_1.calc_feature_statistics(X_test, y_test, feature, plot=True)
Analyse Model Without Text Features¶
In [130]:
# Rebuild the splits without the text features, using the same seed and
# fractions as before so rows land in the same train/val/test partitions.
dset = df.to_pandas()
y = dset['Decision_Outcome']
X = dset.drop(['Decision_Outcome'] + text_features, axis=1)
seed = 127
test_size = 0.2
X_train2, X_test2, y_train2, y_test2 = train_test_split(
X, y, test_size=0.1, random_state=seed)
X_train2, X_val2, y_train2, y_val2 = train_test_split(
X_train2, y_train2, test_size=test_size, random_state=seed)
In [131]:
# Same CatBoost configuration as before, minus the text-feature settings.
params = {'loss_function': 'Logloss', # objective function
'eval_metric': 'AUC', # metric
'verbose': 50, # output to stdout info about training process every 50 iterations
'random_seed': seed,
'cat_features': cat_features,
'class_names': ['NoResponse', 'Clicked'],
'iterations': 100,
'learning_rate': 0.5,
'depth': 8
}
In [132]:
%%time
# Train the comparison model without text features.
cbc_2 = CatBoostClassifier(**params)
cbc_2.fit(X=X_train2, y=y_train2, # data to train on (required parameters, unless we provide X as a pool object, will be shown below)
eval_set=(X_val2, y_val2), # data to validate on
# True if we don't want to save trees created after iteration with the best validation score
use_best_model=True,
# True for visualization of the training process (it is not shown in a published kernel - try executing this code)
plot=True
)
MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
0: test: 0.8241185 best: 0.8241185 (0) total: 10.8ms remaining: 1.06s 50: test: 0.8816126 best: 0.8857634 (23) total: 429ms remaining: 412ms 99: test: 0.8747540 best: 0.8857634 (23) total: 821ms remaining: 0us bestTest = 0.8857633624 bestIteration = 23 Shrink model to first 24 iterations. CPU times: user 4.69 s, sys: 831 ms, total: 5.52 s Wall time: 935 ms
Out[132]:
<catboost.core.CatBoostClassifier at 0x303e1fad0>
In [133]:
def print_score_diff(first_model, second_model):
    """Print the validation AUC of two fitted models and the relative
    difference (in percent) of the second model versus the first."""
    auc_first = first_model.best_score_['validation']['AUC']
    auc_second = second_model.best_score_['validation']['AUC']
    relative_gap = (auc_second - auc_first) / auc_first * 100
    print(f'{auc_first} vs {auc_second} ({relative_gap:+.2f}%)')
# Compare validation AUC: cbc_2 (no text features) vs cbc_1 (with text features).
print('Model AUC difference - without text features vs with text features.')
print_score_diff(cbc_2, cbc_1)
Model AUC difference - without text features vs with text features. 0.8857633624228525 vs 0.8851585394538517 (-0.07%)
In [134]:
# TreeExplainer computes SHAP values for tree ensembles such as CatBoost.
explainer = shap.TreeExplainer(cbc_2)
# Explanation object for the held-out test set (values, base values and data).
shap_values_exp = explainer(X_test2)
In [135]:
shap.plots.bar(shap_values_exp)
In [136]:
shap.plots.beeswarm(shap_values_exp)
In [137]:
shap.plots.force(explainer(X_test2.sample(n=500, random_state=seed)))
Out[137]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [138]:
shap.dependence_plot("Customer_CLV", shap_values_exp.values, X_test2, interaction_index="Customer_DebtToIncomeRatio")
Individual Prediction Explanation¶
In [139]:
shap.plots.force(shap_values_exp[8])
Out[139]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [140]:
shap.plots.waterfall(shap_values_exp[8])
In [141]:
# Predicted class probabilities for the same row explained above.
# NOTE(review): .iloc[8] passes a single row as a Series; the printed output below
# shows a 1-D [P(NoResponse), P(Clicked)] array — use .iloc[[8]] to keep a DataFrame.
preds_proba = cbc_2.predict_proba(X_test2.iloc[8])
print(preds_proba)
[0.00331373 0.99668627]
In [142]:
# Decision plot for row 8: the cumulative SHAP path from the expected value
# to the model output, one step per feature.
shap.decision_plot(
base_value=np.array([explainer.expected_value]),
shap_values=explainer.shap_values(X_test2)[8],
features=X_test2.columns
)
Feature dependency¶
In [143]:
# SHAP scatter for Customer_CLV, coloured by Customer_CreditScore to show a possible interaction.
feature = 'Customer_CLV'
shap.plots.scatter(shap_values_exp[:, feature], color=shap_values_exp[:, "Customer_CreditScore"])
Using global feature importance orderings¶
In [144]:
shap.plots.scatter(shap_values_exp[:, shap_values_exp.abs.mean(0).argsort[-1]], alpha=0.2)
Model Calibration Quality¶
In [145]:
# Calibration curves.
# (The cell-local `import numpy as np` was removed: numpy is already imported
# in the first cell of the notebook.)
def calibration(groundtruth, probs):
    """Build a calibration table for a binary classifier.

    Parameters
    ----------
    groundtruth : array-like of bool or 0/1
        True labels; must contain at most two distinct values.
    probs : array-like of float
        Predicted probabilities of the positive class.

    Returns
    -------
    pl.DataFrame
        One row per probability bin (probabilities rounded to one decimal
        place) with the mean predicted probability, positive/negative counts
        and the observed share of positives.

    Raises
    ------
    ValueError
        If `groundtruth` has more than two distinct values.
    """
    # Coerce booleans (or 0/1 labels) to integers.
    groundtruth_binary = 1*np.array(groundtruth)
    nlabels = len(np.unique(groundtruth))
    if nlabels < 2:
        # Degenerate case: only one class present, so no curve can be drawn.
        # A "bin" column is included so the schema matches the regular path.
        return pl.DataFrame({
            "bin": [None],
            "MeanProbs": [0.5],
            "PositivesShare": [None],
            "binPos": [None],
            "binNeg": [None]
        })
    if nlabels > 2:
        raise ValueError("'groundtruth' has more than two levels.")
    probabilities = pl.DataFrame({
        "groundtruth": groundtruth_binary,
        "probs": probs
    })
    # Named sub-expressions so the positive/total counts are written only once.
    n_pos = pl.sum("groundtruth")
    n_total = pl.count("groundtruth")
    grouped_probabilities = (probabilities
        # Bin probabilities to one decimal place (bins 0..10).
        .with_columns((pl.col("probs") * 10).round().alias("bin"))
        .group_by("bin")
        .agg([
            pl.mean("probs").alias("MeanProbs"),
            n_pos.alias("binPos"),
            (n_total - n_pos).alias("binNeg"),
            (n_pos / n_total).alias("PositivesShare")
        ])
        .sort("bin"))
    return grouped_probabilities
In [146]:
# Binarise the labels (True for the positive 'Clicked' class) with a vectorised
# comparison instead of a per-element .apply(lambda ...) — same bool Series, faster.
y_test_bin = y_test == 'Clicked'
preds_proba = cbc_1.predict_proba(X_test)
# Column 1 holds P(Clicked); the calibration table bins these probabilities.
calibration_data = calibration(y_test_bin, preds_proba[:,1])
In [147]:
import plotly.express as px
import plotly.graph_objects as go  # NOTE(review): unused in this cell — possibly needed by later cells; confirm before removing
# Observed share of positives vs mean predicted probability per bin.
fig = px.line(calibration_data,
x="MeanProbs",
y="PositivesShare")
# Add ideal calibration line (diagonal y = x: a perfectly calibrated model lies on it)
fig.add_shape(type="line", line=dict(dash='dash', color="darkred"), row='all', col='all', x0=0, y0=0, x1=1, y1=1)
# Customize the layout and labels
fig.update_layout(
title="Model calibration plot",
xaxis_title="Mean predicted probability",
yaxis_title="Fraction of positives"
)
fig.show()